{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio import PDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Structure exists: './pdb1tup.ent' \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'./pdb1tup.ent'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#XXX\n",
    "repository = PDB.PDBList()\n",
    "repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "rec_types = {\n",
    "    #single line\n",
    "    'HEADER': [(str, 11, 49), (str, 50, 58), (str, 62, 65)],\n",
    "    #multi_line\n",
    "    'SOURCE': [(int, 7, 9), (str, 10, 78)],\n",
    "    #multi_rec\n",
    "    'LINK' : [(str, 12, 15), (str, 16, 16), (str, 17, 19), (str, 21, 21), (int, 22, 25),\n",
    "              (str, 26, 26), (str, 42, 45), (str, 46, 46), (str, 47, 49), (str, 51, 51),\n",
    "              (int, 52, 55), (str, 56, 56), (str, 59, 64), (str, 66, 71), (float, 73, 77)],\n",
    "    'HELIX': [(int, 7, 9), (str, 11, 13), (str, 15, 17), (str, 19, 19), (int, 21, 24),\n",
    "              (str, 25, 25), (str, 27, 29), (str, 31, 31),\n",
    "              (int, 33, 36), (str, 37 ,37), (int, 38, 39), (str, 40, 69), (int, 71, 75)],\n",
    "    'SHEET': [(int, 7, 9), (str, 11, 13), (int, 14, 15), (str, 17, 19), (str, 21, 21),\n",
    "              (int, 22, 24), (str, 26, 26), (str, 28, 30),\n",
    "              (str, 32, 32), (int, 33, 36), (str, 37, 37), (int, 38, 39), (str, 41, 44),\n",
    "              (str, 45, 47), (str, 49, 49), (int, 50, 53), (str, 54, 54), (str, 56, 59),\n",
    "              (str, 60, 62), (str, 64, 64), (int, 65, 68), (str, 69, 69)],\n",
    "}\n",
    "\n",
    "def parse_pdb(hdl):\n",
    "    for line in hdl:\n",
    "        line = line[:-1]  # remove \\n but not other whitespace\n",
    "        toks = []\n",
    "        for section, elements in rec_types.items():\n",
    "            if line.startswith(section):\n",
    "                for fun, start, end in elements:\n",
    "                    try:\n",
    "                        toks.append(fun(line[start: end + 1]))\n",
    "                    except ValueError:\n",
    "                        toks.append(None)  # eg continuation\n",
    "                yield (section, toks)\n",
    "        if len(toks) == 0:\n",
    "            yield ('UNKNOWN', line)\n",
    "                "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('HEADER', ['NTITUMOR PROTEIN/DNA                   ', '11-JUL-95', '1TUP'])\n",
      "('SOURCE', [None, 'MOL_ID: 1;                                                           '])\n",
      "('HELIX', [1, '  1', 'SER', 'A', 166, ' ', 'HIS', 'A', 168, ' ', 5, '                              ', 3])\n",
      "('SHEET', [1, '  A', 4, 'ARG', 'A', 11, ' ', 'GLY', 'A', 112, ' ', 0, '    ', '   ', ' ', None, ' ', '    ', '   ', ' ', None, ' '])\n",
      "('LINK', ['ZN  ', ' ', ' ZN', 'A', 951, ' ', ' SG ', ' ', 'CYS', 'A', 176, ' ', '  1555', '  1555', 2.33])\n"
     ]
    }
   ],
   "source": [
    "hdl = open('pdb1tup.ent')\n",
    "done_rec = set()\n",
    "for rec in parse_pdb(hdl):\n",
    "    if rec[0] == 'UNKNOWN' or rec[0] in done_rec:\n",
    "        continue\n",
    "    print(rec)\n",
    "    done_rec.add(rec[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "multi_lines = ['SOURCE']\n",
    "\n",
    "#assume multi is just a string\n",
    "def process_multi_lines(hdl):\n",
    "    current_multi = ''\n",
    "    current_multi_name = None\n",
    "    for rec_type, toks in parse_pdb(hdl):\n",
    "        if current_multi_name is not None and current_multi_name != rec_type:\n",
    "            yield current_multi_name, [current_multi]\n",
    "            current_multi = ''\n",
    "            current_multi_name = None\n",
    "        if rec_type in multi_lines:\n",
    "            current_multi += toks[1].strip().rstrip() + ' '\n",
    "            current_multi_name = rec_type\n",
    "        else:\n",
    "            if len(current_multi) != 0:\n",
    "                yield current_multi_name, [current_multi]\n",
    "                current_multi = ''\n",
    "                current_multi_name = None                \n",
    "            yield rec_type, toks\n",
    "    if len(current_multi) != 0:\n",
    "        yield current_multi_name, [current_multi]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('HEADER', ['NTITUMOR PROTEIN/DNA                   ', '11-JUL-95', '1TUP'])\n",
      "('SOURCE', ['MOL_ID: 1; SYNTHETIC: YES; MOL_ID: 2; SYNTHETIC: YES; MOL_ID: 3; ORGANISM_SCIENTIFIC: HOMO SAPIENS; ORGANISM_COMMON: HUMAN; ORGANISM_TAXID: 9606; CELL_LINE: A431; CELL: HUMAN VULVA CARCINOMA; EXPRESSION_SYSTEM: ESCHERICHIA COLI; EXPRESSION_SYSTEM_TAXID: 562; EXPRESSION_SYSTEM_PLASMID: PET3D '])\n",
      "('HELIX', [1, '  1', 'SER', 'A', 166, ' ', 'HIS', 'A', 168, ' ', 5, '                              ', 3])\n",
      "('SHEET', [1, '  A', 4, 'ARG', 'A', 11, ' ', 'GLY', 'A', 112, ' ', 0, '    ', '   ', ' ', None, ' ', '    ', '   ', ' ', None, ' '])\n",
      "('LINK', ['ZN  ', ' ', ' ZN', 'A', 951, ' ', ' SG ', ' ', 'CYS', 'A', 176, ' ', '  1555', '  1555', 2.33])\n"
     ]
    }
   ],
   "source": [
    "hdl = open('pdb1tup.ent')\n",
    "done_rec = set()\n",
    "for rec in process_multi_lines(hdl):\n",
    "    if rec[0] == 'UNKNOWN' or rec[0] in done_rec:\n",
    "        continue\n",
    "    print(rec)\n",
    "    done_rec.add(rec[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_spec_list(my_str):\n",
    "    #ignoring escape characters\n",
    "    spec_list = {}\n",
    "    elems = my_str.strip().strip().split(';')\n",
    "    for elem in elems:\n",
    "        toks = elem.split(':')\n",
    "        spec_list[toks[0].strip()] = toks[1].strip()\n",
    "    return spec_list\n",
    "\n",
    "struct_types = {\n",
    "    'SOURCE': [get_spec_list] \n",
    "}\n",
    "\n",
    "def process_struct_types(hdl):\n",
    "    for rec_type, toks in process_multi_lines(hdl):\n",
    "        if rec_type in struct_types.keys():\n",
    "            funs = struct_types[rec_type]\n",
    "            struct_toks = []\n",
    "            for tok, fun in zip(toks, funs):\n",
    "                struct_toks.append(fun(tok))\n",
    "            yield rec_type, struct_toks\n",
    "        else:\n",
    "            yield rec_type, toks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('SOURCE', [{'MOL_ID': '3', 'SYNTHETIC': 'YES', 'ORGANISM_SCIENTIFIC': 'HOMO SAPIENS', 'ORGANISM_COMMON': 'HUMAN', 'ORGANISM_TAXID': '9606', 'CELL_LINE': 'A431', 'CELL': 'HUMAN VULVA CARCINOMA', 'EXPRESSION_SYSTEM': 'ESCHERICHIA COLI', 'EXPRESSION_SYSTEM_TAXID': '562', 'EXPRESSION_SYSTEM_PLASMID': 'PET3D'}])\n"
     ]
    }
   ],
   "source": [
    "hdl = open('pdb1tup.ent')\n",
    "for rec in process_struct_types(hdl):\n",
    "    if rec[0] != 'SOURCE':\n",
    "        continue\n",
    "    print(rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
